/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Hunspell, based on MySpell.
 *
 * The Initial Developers of the Original Code are
 * Kevin Hendricks (MySpell) and Németh László (Hunspell).
 * Portions created by the Initial Developers are Copyright (C) 2002-2005
 * the Initial Developers. All Rights Reserved.
 *
 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>

#include "../hunspell/csutil.hxx"
#include "xmlparser.hxx"


#ifndef W32
using namespace std;
#endif

// States of the tokenizer loop in XMLParser::next_token().
enum {
	ST_NON_WORD,    // scanning text that is not part of a word
	ST_WORD,        // collecting the characters of a word
	ST_TAG,         // inside a construct matched by a tag pattern
	ST_CHAR_ENTITY, // after '&', waiting for the closing ';'
	ST_OTHER_TAG,   // not used by the parser code visible in this file
	ST_ATTRIB       // inside a quoted attribute value of a tag
};

// Default tag patterns, one row per construct: { opening text, closing text }.
// look_pattern() lower-cases the input (not the pattern) before comparing,
// so the entries are written in lower case. Rows are tried in order and the
// first match is returned, so the generic "<" row has to stay last.
static const char * __PATTERN__[][2] = {
	{ "<!--", "-->" }, // XML comment
	{ "<[cdata[", "]]>" }, // NOTE(review): presumably meant for "<![CDATA[" sections, but without the '!' it cannot match them (they fall through to the "<" row) - confirm
	{ "<", ">" } // any other tag
};

// Number of rows in __PATTERN__.
#define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char *) * 2))

// Second pattern table handed to next_token() by the no-argument overload.
// This parser has no such patterns. An empty brace initializer would declare
// a zero-length array, which ISO C++ does not allow (it only builds as a
// compiler extension), so the empty table is expressed as a null pointer to
// a row plus a length of 0. The table is never dereferenced with length 0.
static const char * (*__PATTERN2__)[2] = NULL;

// Number of rows in __PATTERN2__.
#define __PATTERN_LEN2__ 0

// XML entity for the ASCII apostrophe.
#define ENTITY_APOS "&apos;"
// UTF-8 byte sequence of U+2019 (right single quotation mark).
#define UTF8_APOS "\xe2\x80\x99"
// Plain ASCII apostrophe.
#define APOSTROPHE "'"

// Default constructor: clears the tag-tracking fields. init() is not called.
XMLParser::XMLParser() :
	pattern_num(0),
	pattern2_num(0),
	prevstate(0),
	checkattr(0),
	quotmark(0)
{
}

// Clears the tag-tracking fields, then passes the word character set on
// to init().
XMLParser::XMLParser(const char * wordchars) :
	pattern_num(0),
	pattern2_num(0),
	prevstate(0),
	checkattr(0),
	quotmark(0)
{
	init(wordchars);
}

// Clears the tag-tracking fields, then passes the word character array and
// its length on to init().
XMLParser::XMLParser(unsigned short * wordchars, int len) :
	pattern_num(0),
	pattern2_num(0),
	prevstate(0),
	checkattr(0),
	quotmark(0)
{
	init(wordchars, len);
}

// Destructor with an empty body.
XMLParser::~XMLParser() {}

/*
 * Tests the text at the current position (line[actual] + head) against
 * the rows of a pattern table.
 *
 * p      - table of string pairs
 * len    - number of rows of p to try
 * column - which string of each pair to compare; next_token() passes 0
 *          for the opening text and 1 for the closing text
 *
 * Input bytes are lower-cased before the comparison, pattern bytes are
 * taken as they are. Returns the index of the first row whose string is a
 * prefix of the input, or -1 if no row matches.
 */
int XMLParser::look_pattern(const char * p[][2], unsigned int len, int column)
{
	for (unsigned int i = 0; i < len; i++) {
		const char * j = line[actual] + head;
		const char * k = p[i][column];
		// tolower() is only defined for values representable as
		// unsigned char (or EOF); a plain char holding a byte >= 0x80
		// may be negative, so both sides are compared as unsigned char.
		while ((*k != '\0') &&
		       (tolower(static_cast<unsigned char>(*j)) == static_cast<unsigned char>(*k))) {
			j++;
			k++;
		}
		// the whole pattern was consumed: row i matches
		if (*k == '\0') return static_cast<int>(i);
	}
	return -1;
}

/*
 * XML parser
 *
 * State machine that walks line[actual] from position head and returns
 * the next word.
 *
 * PATTERN / PATTERN_LEN   - { opening, closing } pairs of the constructs
 *                           whose content is skipped (see look_pattern())
 * PATTERN2 / PATTERN_LEN2 - second table: when one of its opening strings
 *                           matches where a tag starts, and later its
 *                           column-1 string matches inside that tag, the
 *                           words of the following quoted attribute value
 *                           are tokenized as well (checkattr == 2)
 *
 * Returns the non-null result of alloc_token() when a word ends, or NULL
 * as soon as next_char() returns non-zero.
 */
char * XMLParser::next_token(const char * PATTERN[][2], unsigned int PATTERN_LEN, const char * PATTERN2[][2], unsigned int PATTERN_LEN2)
{
	const char * latin1;

	for (;;) {
		switch (state)
		{
		case ST_NON_WORD: // non word chars
			prevstate = ST_NON_WORD;
			if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
				// a tag opens here; checkattr records whether it is
				// also listed in PATTERN2
				checkattr = 0;
				if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
					checkattr = 1;
				}
				state = ST_TAG;
			} else if (is_wordchar(line[actual] + head)) {
				state = ST_WORD;
				token = head;
			} else if ((latin1 = get_latin1(line[actual] + head))) {
				// get_latin1() recognised something at this position:
				// start a word here and skip the returned string's length
				state = ST_WORD;
				token = head;
				head += strlen(latin1);
			} else if (line[actual][head] == '&') {
				state = ST_CHAR_ENTITY;
			}
			break;
		case ST_WORD: // wordchar
			if ((latin1 = get_latin1(line[actual] + head))) {
				head += strlen(latin1);
			} else if ((is_wordchar((char *) APOSTROPHE) || (is_utf8() && is_wordchar((char *) UTF8_APOS))) &&
					strncmp(line[actual] + head, ENTITY_APOS, strlen(ENTITY_APOS)) == 0 &&
					is_wordchar(line[actual] + head + strlen(ENTITY_APOS))) {
				// "&apos;" followed by a word character stays inside the
				// word when an apostrophe is a word character
				head += strlen(ENTITY_APOS) - 1;
			} else if (is_utf8() && is_wordchar((char *) APOSTROPHE) && // add Unicode apostrophe to the WORDCHARS, if needed
					strncmp(line[actual] + head, UTF8_APOS, strlen(UTF8_APOS)) == 0 &&
					is_wordchar(line[actual] + head + strlen(UTF8_APOS))) {
				head += strlen(UTF8_APOS) - 1;
			} else if (! is_wordchar(line[actual] + head)) {
				// the word ends here: go back to the state it started in
				// and hand the token out if alloc_token() produced one
				state = prevstate;
				char * t = alloc_token(token, &head);
				if (t) return t;
			}
			break;
		case ST_TAG: { // comment, labels, etc
			// braces keep i local to this case so that the later case
			// labels do not jump over its declaration
			int i;
			if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1)
				&& (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num][0]) == 0)) {
					checkattr = 2;
			} else if ((checkattr > 0) && (line[actual][head] == '>')) {
					state = ST_NON_WORD;
			} else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
				(strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) == 0)) {
					// closing text of the construct that was opened
					state = ST_NON_WORD;
					head += strlen(PATTERN[pattern_num][1]) - 1;
			} else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) &&
				((line[actual][head] == '"') || (line[actual][head] == '\''))) {
				// quoted attribute value inside a plain "<" tag; remember
				// which quote character has to close it
				quotmark = line[actual][head];
				state = ST_ATTRIB;
			}
			break;
		}
		case ST_ATTRIB: // non word chars
			prevstate = ST_ATTRIB;
			if (line[actual][head] == quotmark) {
				state = ST_TAG;
				if (checkattr == 2) checkattr = 1;
			 // for IMG ALT
			} else if (is_wordchar(line[actual] + head) && (checkattr == 2)) {
				state = ST_WORD;
				token = head;
			} else if (line[actual][head] == '&') {
				state = ST_CHAR_ENTITY;
			}
			break;
		case ST_CHAR_ENTITY: // SGML element
			// compared directly: passing a plain (possibly negative) char
			// to tolower() is undefined and case-folding cannot affect ';'
			if (line[actual][head] == ';') {
				state = prevstate;
				// presumably so that the next_char() call below lands on
				// the ';' again and it is handled in the restored state
				head--;
			}
			break;
		}
		if (next_char(line[actual], &head)) return NULL;
	}
}

// Overload without arguments: runs the tokenizer with this file's own
// pattern tables.
char * XMLParser::next_token()
{
	return next_token(
		__PATTERN__, __PATTERN_LEN__,
		__PATTERN2__, __PATTERN_LEN2__);
}

// Returns a copy of word in which & ' " > < are replaced by their XML
// entities. The text is rewritten in a single pass, so the '&' of an
// inserted entity is never escaped again and no placeholder string is
// needed (a placeholder would corrupt words that happen to contain it).
static std::string xml_escape_word(const char * word)
{
	std::string escaped;
	for (const char * c = word; *c != '\0'; ++c) {
		switch (*c) {
		case '&':  escaped += "&amp;";     break;
		case '\'': escaped += ENTITY_APOS; break;
		case '"':  escaped += "&quot;";    break;
		case '>':  escaped += "&gt;";      break;
		case '<':  escaped += "&lt;";      break;
		default:   escaped += *c;          break;
		}
	}
	return escaped;
}

/*
 * Replaces the current token with word, escaping the XML special
 * characters first.
 *
 * word - replacement text, NUL-terminated
 *
 * Returns whatever TextParser::change_token() returns.
 */
int XMLParser::change_token(const char * word)
{
	// nothing to escape: pass the word through without copying it
	if (strpbrk(word, "'\"&<>") == NULL) {
		return TextParser::change_token(word);
	}
	const std::string escaped = xml_escape_word(word);
	return TextParser::change_token(escaped.c_str());
}
